Since our drive dataset is too small to build a classifier on, we need to extend the set.
We collected a lot of time-related data, which we will not have in a real-world application of this classifier. Based on that time-related data, we want to predict whether an article is evergreen or not. This may be evaluated by the views in the days after publication.
Steps:
import pandas as pd
import numpy as np
import json
from datetime import datetime
import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# load labels
labels = pd.read_json('project_3_labels.json')
# load dataframe with all content information included
df_content = pd.read_csv('EDA_Evergreen.csv')
# load time elements (daily pageview rows per article)
df_pageviews = pd.read_csv('evergreen_pagereads')
# load word bags (document-term matrices)
# BUG FIX: the two file names were swapped — the title matrix was being
# loaded into data_dtm_body and the body matrix into data_dtm_title.
data_dtm_body = pd.read_csv('data_dtm_body.csv')
data_dtm_title = pd.read_csv('data_dtm_title.csv')
# drop columns that are not needed for the time-series analysis
del df_content['Unnamed: 0']
del df_content['id']
del df_content['text']
del df_content['meta.weeks_with_more_than_50_clicks']
# create one df based on content and pageviews
df = df_pageviews.join(df_content.set_index('article_drive_id')[['meta.publisher','label_id','label_text',
                                            'title','text_content','topic',
                                            'locality','newstype','genre']], on='article_drive_id')
# drop duplicates generated by join
df.drop_duplicates(subset=['article_drive_id', 'days_past', 'views_for_certain_day'], inplace=True)
# deleting seasonal articles (label 173)
df = df.loc[df['label_id'] != 173]
# save all drive_id's for iterating later
unique_drive_id = set(df['article_drive_id'])
# extract each evergreen type into its own dataframe
# BUG FIX: these three selections were wrapped in a loop over
# range(len(unique_drive_id)) that re-assigned the identical frames on every
# pass — the loop variable was never used, so one pass is enough.
df_non_ev = df.loc[df['label_id'] == 175]
df_event_ev = df.loc[df['label_id'] == 174]
df_zeitlos_ev = df.loc[df['label_id'] == 172]
Since some publishers and articles are more famous than others, and since evergreens show a steep decrease in views over the first days, we normalize based on the given days.
Thus, we can highlight the behavior in the first days and distinguish between evergreens and non-evergreens. Note that normalization functions such as Softmax or symmetric normalization ruin the structure of the evergreen articles due to the high number of non-evergreen articles.
We use a normalization function that takes the first x days to calculate their average and then normalizes the following days based on that average.
# restrict the frame to the first `days` days (generates the vector length)
def hidden_dims(days: int, df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of *df* whose 'days_past' is strictly less than *days*.

    Raises KeyError (after printing a hint) when any of the required columns
    is missing — same error type the old attribute-access check produced,
    but without the bare ``except:`` that also swallowed unrelated errors.
    """
    required = ('days_past', 'article_drive_id', 'views_for_certain_day')
    missing = [col for col in required if col not in df.columns]
    if missing:
        print("The given Dataset has no column named 'days_past', 'article_drive_id' or 'views_for_certain_day'")
        raise KeyError(missing)
    return df.loc[df.days_past < days]
# take the average and normalize the past days based on that average
def normalizer(normalize_value: float, past_days: list):
    """Divide every value in *past_days* by *normalize_value*.

    The result is prefixed with ``normalize_value / normalize_value`` so the
    curve starts at 1.0 for any finite non-zero baseline. The division is
    kept deliberately: a NaN baseline propagates NaN into the first slot and
    a zero baseline raises ZeroDivisionError, exactly as before.

    FIX: the annotation said ``int`` but callers pass a float mean.

    Returns:
        (normalized_list, days_past) where days_past is the running day
        index 0..len(normalized_list)-1.
    """
    # first day normalized by itself (1.0, NaN, or ZeroDivisionError)
    normalized_list = [normalize_value / normalize_value]
    # normalize all values in the list based on the average
    normalized_list += [x / normalize_value for x in past_days]
    days_past = list(range(len(normalized_list)))
    return normalized_list, days_past
# here we generate the average on the given days
def normalize_views(days: int, average_of_days: int, df: pd.DataFrame) -> pd.DataFrame:
    """Normalize each article's daily views by its own early-days average.

    For every article in *df*, the mean view count over the first
    *average_of_days* days is used as a baseline, and every later day (up to
    ``days + average_of_days``) is divided by it via ``normalizer``.

    Returns a frame with one exploded row per normalized day and the columns
    article_drive_id, views_for_certain_day_normalized, meta.publisher and
    days_past.

    Raises:
        Exception: when ``days <= average_of_days``.
    """
    # throw exception if the averaging window is not smaller than the range
    if days <= average_of_days:
        raise Exception('the given values are invalid')
    # get the relevant days we need to consider
    relevant_days = days + average_of_days
    # get the relevant rows and just work with that (also validates columns)
    df = hidden_dims(relevant_days, df)
    # FIX: DataFrame.append is deprecated (removed in pandas 2.0) and the
    # empty float-column seed triggered a dtype DeprecationWarning —
    # accumulate plain dicts and build the frame once instead.
    records = []
    days_past_list = []
    for drive_id in set(df['article_drive_id']):
        # all rows of the current article (hidden_dims already applied the
        # `days_past < relevant_days` window, so no second filter is needed)
        i_article = df.loc[df.article_drive_id == drive_id]
        # baseline: average views over the first `average_of_days` days
        average_views = i_article.loc[i_article.days_past < average_of_days].views_for_certain_day.mean()
        # views of all days after the baseline window
        past_average_views = i_article.loc[i_article.days_past >= average_of_days].views_for_certain_day
        # normalize the values against the baseline
        normalized_list, days_past = normalizer(average_views, list(past_average_views))
        days_past_list.extend(days_past)
        # publisher of the corresponding article (constant per article)
        publisher = i_article['meta.publisher'].iloc[0]
        records.append({'article_drive_id': drive_id,
                        'views_for_certain_day_normalized': normalized_list,
                        'meta.publisher': publisher})
    df_av = pd.DataFrame(records, columns=['article_drive_id',
                                           'views_for_certain_day_normalized',
                                           'meta.publisher'])
    # unnest lists: one row per normalized day
    df_av = df_av.explode(column='views_for_certain_day_normalized')
    # days past from the new range we are considering
    df_av['days_past'] = days_past_list
    return df_av
# look at the first 79 days
days_count_norm = 80
# normalize all three evergreen types over the same window, using the mean
# of the first 5 days as the per-article baseline
df_non_ev_norm, df_event_ev_norm, df_zeitlos_ev_norm = (
    normalize_views(days_count_norm, 5, frame)
    for frame in (df_non_ev, df_event_ev, df_zeitlos_ev)
)
# saving clicks of each evergreen type to a list
# NOTE(review): day_count is never used below — the loop runs over
# days_count_norm (80), not the full day range; kept only for compatibility
# in case a later cell reads it.
day_count = set(df['days_past'])
# per-day median of the normalized views; the median is used to avoid
# outliers (FIX: the previous loop repeated this body three times verbatim)
def _daily_median(frame: pd.DataFrame, day: int):
    return frame.loc[frame['days_past'] == day]['views_for_certain_day_normalized'].median()
non_ev_clicks = [_daily_median(df_non_ev_norm, d) for d in range(days_count_norm)]
event_ev_clicks = [_daily_median(df_event_ev_norm, d) for d in range(days_count_norm)]
zeitlos_ev_clicks = [_daily_median(df_zeitlos_ev_norm, d) for d in range(days_count_norm)]
# because event-evergreen articles get nearly no views in the future, their
# per-day median can be NaN (x != x) — replace those with 0
event_ev_clicks = [0 if x != x else x for x in event_ev_clicks]
# plotting results: one median curve per evergreen type
median_curves = {'Non-Evergreen': non_ev_clicks,
                 'Event-Evergreen': event_ev_clicks,
                 'Zeitlos-Evergreen': zeitlos_ev_clicks}
evergreen_click_types = pd.DataFrame(median_curves)
fig = px.line(evergreen_click_types)
fig.update_layout(title='Average clicks of each evergreen type based on the 80 Days (with normalization)',
                  xaxis_title='Days',
                  yaxis_title='Clicks')
fig.show()
It is noticeable that both non-evergreen and evergreen articles show significant popularity differences between the first four days and the days after. However, the decrease in non-evergreen pageviews is much steeper than for evergreen articles. Thus, we can consider evergreen articles more consistent when it comes to pageviews over time.
from scipy import spatial
def cosine_similarity(vector1, vector2):
    """Cosine similarity of two vectors, expressed as a percentage."""
    # scipy returns the cosine *distance*; similarity = 1 - distance
    distance = spatial.distance.cosine(vector1, vector2)
    return 100 * (1 - distance)
# convert to vectors
non_ev_curve = np.array(non_ev_clicks)
event_ev_curve = np.array(event_ev_clicks)
zeitlos_ev_curve = np.array(zeitlos_ev_clicks)
# now lets check if our assumption is correct.
print('Similarity between:\nevent_ev and non_ev_curve', cosine_similarity(event_ev_curve, non_ev_curve), '%')
print('zeitlos_ev and non_ev', cosine_similarity(zeitlos_ev_curve, non_ev_curve), '%')
# BUG FIX: this comparison is zeitlos_ev vs event_ev but was mislabelled
# 'event_ev and non_ev' in the printed output
print('zeitlos_ev and event_ev', cosine_similarity(zeitlos_ev_curve, event_ev_curve), '%')
Similarity between: event_ev and non_ev_curve 55.87117952118589 % zeitlos_ev and non_ev 73.00445690960117 % event_ev and non_ev 91.1965602662992 %
Using the full batch, we can observe that event evergreens and zeitlos evergreens are quite similar, as expected (~91% similarity).
# combine all three normalized frames into one df
# FIX: DataFrame.append is deprecated (removed in pandas 2.0) — pd.concat
# with ignore_index=True produces the same fresh 0..n index.
df_norm = pd.concat([df_non_ev_norm, df_event_ev_norm, df_zeitlos_ev_norm],
                    ignore_index=True)
# drop nans (articles/days with no usable normalized value)
df_norm = df_norm[~df_norm.views_for_certain_day_normalized.isna()]
There are some articles which have not gained views until day 80. Thus, we pad them with their median value.
vectors = []
article_drive_id = []
vector_length = []
# get all views as one fixed-length (80) vector per article
for drive_id in set(df_norm.article_drive_id):
    # subset the article once instead of twice per iteration
    article = df_norm.loc[df_norm.article_drive_id == drive_id]
    # the normalized views as a Series
    list_vector = article.views_for_certain_day_normalized
    # last observed day index for this article (rows are in day order)
    list_length = int(article.days_past.iloc[-1])
    # pad with the article's median if the vector is shorter than 80
    if list_length < 80:
        pad_len = 79 - list_length
        # FIX: Series.append is deprecated (removed in pandas 2.0) — concat
        list_vector = pd.concat([list_vector,
                                 pd.Series(pad_len * [list_vector.median()])])
    # if the vector is too big, truncate to 80 entries
    else:
        list_vector = list_vector[:80]
    article_drive_id.append(drive_id)
    vectors.append(np.array(np.float64(list_vector)))
    vector_length.append(list_length)
df_vector = pd.DataFrame({'article_drive_id': article_drive_id,
                          'vector_views': vectors,
                          'vector_length': vector_length})
<ipython-input-17-130e1208e8a9>:14: DeprecationWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.
# attach the labels to each article vector
label_lookup = df.set_index('article_drive_id')[['label_id', 'label_text']]
df_vector = df_vector.join(label_lookup, on='article_drive_id')
# the join produces one row per pageview row in df — keep one per article
df_vector.drop_duplicates(subset=['article_drive_id'], inplace=True)
# example for just one article
# similarity between nonevergreen article and nonev curve and evergreencurve
def curve_similarity(vector):
    """Similarity (in %) of *vector* to the three median reference curves.

    Returns the tuple (non-evergreen, zeitlos, event) similarities.
    """
    reference_curves = (non_ev_curve, zeitlos_ev_curve, event_ev_curve)
    return tuple(cosine_similarity(vector, curve) for curve in reference_curves)
liste = df_vector.vector_views.map(curve_similarity)
# write one similarity column per reference curve onto df_vector
for pos, column in enumerate(['NonEv_similarity', 'ZtEv_similarity', 'EventEv_similarity']):
    df_vector[column] = [sims[pos] for sims in liste]
df_vector
| article_drive_id | vector_views | vector_length | label_id | label_text | NonEv_similarity | ZtEv_similarity | EventEv_similarity | |
|---|---|---|---|---|---|---|---|---|
| 0 | bae8b1ee4a4933e81c5de018f3c351e9 | [1.0, 0.04873294346978557, 0.09421702404158544... | 69 | 175 | kein Evergreen | 85.240634 | 80.008094 | 72.873027 |
| 1 | d1ed29d4eb0a677add573451a5f76c9e | [1.0, 0.023765240752221535, 0.0165323413928497... | 80 | 175 | kein Evergreen | 95.938743 | 61.209721 | 43.940695 |
| 2 | e91cdb7d6dec09dd27d7f451a75c7817 | [1.0, 63.6, 18.2, 4.2, 6.8, 11.2, 17.8, 11.6, ... | 32 | 175 | kein Evergreen | 27.984508 | 58.071764 | 57.437826 |
| 3 | 3de9aba18abe8ba58de6e98751d78b30 | [1.0, 2.5644171779141103, 1.5030674846625767, ... | 80 | 175 | kein Evergreen | 45.296417 | 76.042452 | 75.168649 |
| 4 | 23580d0b456b93933fce8ac445a3b38c | [1.0, 0.136986301369863, 0.0684931506849315, 0... | 9 | 175 | kein Evergreen | 94.974894 | 86.381776 | 75.763497 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 306 | 6a0c451be62ab70abda6cb3e92fa8770 | [1.0, 0.15087956698240865, 0.12652232746955344... | 80 | 175 | kein Evergreen | 98.862760 | 67.000969 | 49.141783 |
| 307 | 7e54f3ab31d7acfec177db9820b04294 | [1.0, 0.020375896316750543, 0.0163675232708324... | 78 | 175 | kein Evergreen | 95.303792 | 54.719883 | 36.523678 |
| 308 | 67af006706fc1da769ef3b50d360a3d0 | [1.0, 0.11725293132328309, 0.13759272553242402... | 71 | 175 | kein Evergreen | 94.719253 | 64.958274 | 45.762123 |
| 309 | 93dbec9597f55a6617cdbf7ffd4e046f | [1.0, 0.013753702920016927, 0.0116377486246297... | 79 | 175 | kein Evergreen | 95.656082 | 58.900453 | 42.325079 |
| 310 | be27bab7f0a8444bf03e2d19369e8b3c | [1.0, 0.2222222222222222, 0.09401709401709402,... | 80 | 175 | kein Evergreen | 50.350162 | 77.187094 | 84.443639 |
311 rows × 8 columns
# only work with data which has atleast views until day 66
#df_vector = df_vector.loc[df_vector.vector_length > 65]
# zeitlos evergreens (label 172) whose curve is too similar (>90%) to the
# non-evergreen reference curve
zt_fails = df_vector.loc[(df_vector.label_id == 172)
                         & (df_vector.NonEv_similarity > 90)].vector_views
zt_fails.reset_index(drop=True, inplace=True)
# event evergreens (label 174) with the same failure mode
ev_fails = df_vector.loc[(df_vector.label_id == 174)
                         & (df_vector.NonEv_similarity > 90)].vector_views
ev_fails.reset_index(drop=True, inplace=True)
# plot a handful of those misbehaving curves side by side
example_curves = pd.DataFrame(list(zip(zt_fails[1], zt_fails[2], zt_fails[3], ev_fails[0], ev_fails[1])))
fig = px.line(example_curves)
fig.update_layout(title='Evergreens behaving like non-evergreens',
                  xaxis_title='Days',
                  yaxis_title='Clicks')
fig.show()
# non-evergreens (label 175) whose curve is too similar (>90%) to the
# zeitlos evergreen reference curve
non_fails = df_vector.loc[(df_vector.label_id == 175)
                          & (df_vector.ZtEv_similarity > 90)].vector_views
non_fails.reset_index(drop=True, inplace=True)
# plot three of those misclassification candidates
example_curves = pd.DataFrame(list(zip(non_fails[0], non_fails[1], non_fails[2])))
fig = px.line(example_curves)
fig.update_layout(title='NonEvergreens behaving like Evergreens',
                  xaxis_title='Days',
                  yaxis_title='Clicks')
fig.show()
non_fails[0]
array([1. , 0.125 , 0.25 , 0.25 , 0.125 , 0.125 , 0.125 , 0.375 ,
0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875,
0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875,
0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875,
0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875,
0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875,
0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875,
0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875,
0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875,
0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875, 0.1875])
This approach would have made a great classifier, since it has the most reliable performance and indeed the best accuracy.
However, there are still some issues to consider if we want to use it as a data extension: